Part 1: Reading in the Data

First, we need to read in the data and join the training and test data.

train_data <- read_csv("reddit_stress_data/dreaddit-train.csv")
## Rows: 2838 Columns: 116
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (4): subreddit, post_id, sentence_range, text
## dbl (112): id, label, confidence, social_timestamp, social_karma, syntax_ari...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
test_data <- read_csv("reddit_stress_data/dreaddit-test.csv")
## Rows: 715 Columns: 116
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (4): subreddit, post_id, sentence_range, text
## dbl (112): id, label, confidence, social_timestamp, social_karma, syntax_ari...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
reddit_stress_data <- add_row(train_data, test_data)

Now we need to find the word distributions. We’ll start by unnesting the tokens and training this on the full dataset.

words_tokenized <- reddit_stress_data %>%
  select(c("id", "text", "label", "subreddit")) %>%
  unnest_tokens(word, text) %>%
  mutate(word = gsub('[[:punct:]]+','', word)) %>%
  mutate(word = gsub('\\<[[:digit:]]+\\>', '%d%', word)) %>%
  anti_join(stop_words)
## Joining, by = "word"
words_tokenized_test <- test_data %>%
  select(c("id", "text", "label", "subreddit")) %>%
  unnest_tokens(word, text) %>%
  mutate(word = gsub('[[:punct:]]+','', word)) %>%
  mutate(word = gsub('\\<[[:digit:]]+\\>', '%d%', word)) %>%
  anti_join(stop_words)
## Joining, by = "word"
words_tokenized_train <- train_data %>%
  select(c("id", "text", "label", "subreddit")) %>%
  unnest_tokens(word, text) %>%
  mutate(word = gsub('[[:punct:]]+','', word)) %>%
  mutate(word = gsub('\\<[[:digit:]]+\\>', '%d%', word)) %>%
  anti_join(stop_words)
## Joining, by = "word"

Part 2: Understanding the General Structure of the Data

Getting the Labels Distribution

label_counts <- reddit_stress_data %>%
  group_by(label) %>%
  count()
plot_ly(label_counts, x = ~label, y = ~n, type = "bar")

Subreddit Distribution

subreddit_counts <- reddit_stress_data %>%
  group_by(subreddit) %>%
  count()
plot_ly(subreddit_counts, x = ~subreddit, y = ~n, kind = "bar")
## No trace type specified:
##   Based on info supplied, a 'bar' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#bar
## Warning: 'bar' objects don't have these attributes: 'kind'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'selectedpoints', 'hoverinfo', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'x', 'x0', 'dx', 'y', 'y0', 'dy', 'xperiod', 'yperiod', 'xperiod0', 'yperiod0', 'xperiodalignment', 'yperiodalignment', 'text', 'texttemplate', 'hovertext', 'hovertemplate', 'textposition', 'insidetextanchor', 'textangle', 'textfont', 'insidetextfont', 'outsidetextfont', 'constraintext', 'cliponaxis', 'orientation', 'base', 'offset', 'width', 'marker', 'offsetgroup', 'alignmentgroup', 'selected', 'unselected', 'r', 't', '_deprecated', 'error_x', 'error_y', 'xcalendar', 'ycalendar', 'xaxis', 'yaxis', 'idssrc', 'customdatasrc', 'metasrc', 'hoverinfosrc', 'xsrc', 'ysrc', 'textsrc', 'texttemplatesrc', 'hovertextsrc', 'hovertemplatesrc', 'textpositionsrc', 'basesrc', 'offsetsrc', 'widthsrc', 'rsrc', 'tsrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'

Labels By Subreddit

ggplot(reddit_stress_data, aes(y = label), height = 100, width = 50) + geom_bar(fill = "steelblue") + labs(title = "Labels by Subreddit") + facet_grid(label ~ subreddit)

Part 3: Visualizing the Most Common Words Among the Dataset

First, to get a better idea of what the overall word distribution is, let’s plot all of the word frequencies.

my_top_word_counts <- words_tokenized %>%
    count(word) %>%
    arrange(desc(n))
ggplot(my_top_word_counts, aes(x = word, y = n)) + geom_col()

Now let’s see the most common words among the data (overall).

GetTopNMostCommonWords <- function(df, num) {
  top_word_counts <- df %>%
    count(word) %>%
    arrange(desc(n))
  return (head(top_word_counts, num))
}
num <- 20
top_10_full_data <- GetTopNMostCommonWords(words_tokenized, num)

Now I will plot the rop 20 most common words in the dataset

ggplot(top_10_full_data, aes(x = reorder(word, desc(n)), y = n)) + geom_col(fill = "steelblue") + labs(title = "Top 10 Words from the Full Dataset", x = "Word", y = "Frequency")

Now let’s see how this varies among label: stressed or non-stressed.

stressed_data <- filter(words_tokenized, label == 0)
non_stressed_data <- filter(words_tokenized, label == 1)

Now let’s plot them

ggplot(GetTopNMostCommonWords(non_stressed_data, num), aes(x = reorder(word, desc(n)), y = n)) + geom_col(fill = "steelblue") + labs(title = "Top 10 Words from the Non-Stressed Dataset", x = "Word", y = "Frequency")

Now let’s see the difference among stressed data.

ggplot(GetTopNMostCommonWords(stressed_data, num), aes(x = reorder(word, desc(n)), y = n)) + geom_col(fill = "steelblue") + labs(title = "Top 10 Words from the Stressed Dataset", x = "Word", y = "Frequency")

# Exploring Differences in Subreddit Data Now we’re going to examine the differences by subreddit. First, we will see what unique subreddits have been selected. For each subreddit, I want to examine the difference between the labels and the different words among each label.

unique(reddit_stress_data$subreddit)
##  [1] "ptsd"             "assistance"       "relationships"    "survivorsofabuse"
##  [5] "domesticviolence" "anxiety"          "homeless"         "stress"          
##  [9] "almosthomeless"   "food_pantry"

I’m interested in understanding how the data is distributed among each of these subreddits.

ggplot(reddit_stress_data, aes(x = subreddit)) + geom_bar(fill = "steelblue")

PTSD Subreddit

Now, let’s check out the ptsd subreddit.

ptsd_data <- filter(words_tokenized, subreddit == "ptsd")
ggplot(filter(reddit_stress_data, subreddit == "ptsd"), aes(x = label)) + geom_bar(fill = "steelblue") + labs(title = "PTSD Data by Label")

ggplot(GetTopNMostCommonWords(ptsd_data, num), aes(x = reorder(word, desc(n)), y = n)) + geom_col(fill = "steelblue") + labs(title = "Top 10 Words from the PTSD Subreddit", x = "Word", y = "Frequency")

## Domestic Violence Subreddit Now, let’s check out the domestic violence subreddit.

domestic_violence_data <- filter(words_tokenized, subreddit == "domesticviolence")
ggplot(filter(reddit_stress_data, subreddit == "domesticviolence"), aes(x = label)) + geom_bar(fill = "steelblue") + labs(title = "Domestic Violence Data by Label")

ggplot(GetTopNMostCommonWords(domestic_violence_data, num), aes(x = reorder(word, desc(n)), y = n)) + geom_col(fill = "steelblue") + labs(title = "Top 10 Words from the Domestic Violence Subreddit", x = "Word", y = "Frequency")

## Almost Homeless Subreddit Now let’s check out the almost homeless subreddit

almost_homeless_data <- filter(words_tokenized, subreddit == "almosthomeless")
ggplot(filter(reddit_stress_data, subreddit == "almosthomeless"), aes(x = label)) + geom_bar(fill = "steelblue") + labs(title = "Almost Homeless Data by Label")

ggplot(GetTopNMostCommonWords(almost_homeless_data, num), aes(x = reorder(word, desc(n)), y = n)) + geom_col(fill = "steelblue") + labs(title = "Top 10 Words from the Almost Homeless Subreddit", x = "Word", y = "Frequency")

## Assistance Subreddit

assistance_subreddit <- filter(words_tokenized, subreddit == "assistance")
ggplot(filter(reddit_stress_data, subreddit == "assistance"), aes(x = label)) + geom_bar(fill = "steelblue") + labs(title = "Assistance Data by Label")

ggplot(GetTopNMostCommonWords(assistance_subreddit, num), aes(x = reorder(word, desc(n)), y = n)) + geom_col(fill = "steelblue") + labs(title = "Top 20 Words from the Assistance Subreddit", x = "Word", y = "Frequency")

Part 4: Visualizing the Distribution of Sentiment

Overall

ggplot(reddit_stress_data, aes(x = sentiment)) + geom_histogram(fill = "steelblue", bins = 50) + labs(title = "Distribution of Sentiment")

## By Label

ggplot(reddit_stress_data, aes(x = sentiment)) + geom_histogram(fill = "steelblue", bins = 50) + labs(title = "Distribution of Sentiment") + facet_wrap(~ label)

## By Subreddit

ggplot(reddit_stress_data, aes(x = sentiment)) + geom_histogram(fill = "steelblue", bins = 50) + labs(title = "Distribution of Sentiment") + facet_wrap(~ subreddit)

By Label and Subreddit

ggplot(reddit_stress_data, aes(x = sentiment)) + geom_histogram(fill = "steelblue", bins = 50) + labs(title = "Distribution of Sentiment") + facet_grid(subreddit ~ label)